# Knitr chunk options: center all figures in the rendered document.
knitr::opts_chunk$set(fig.align="center")
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
# Shared ggplot theme for every figure below.
theme_set(theme_light())
# Project helpers used throughout: user_response_posterior_draws_plot() and
# user_response_expected_diff_in_mean_plot().
source('helper_functions.R')
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).
Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s average score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.
# Response metrics collected from the post-task questionnaire.
# (The duplicate re-definition of `analyses` that previously followed the
# read.csv() call has been removed; it was byte-identical to this one.)
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")
user_response_data <- read.csv('split_by_participant_groups/ptask_responses.csv')
# Likert-style responses must be ordered factors for the cumulative("probit")
# family used by the models below.
user_response_data[, analyses] <- lapply(user_response_data[, analyses], ordered)
# Experimental design variables are categorical predictors.
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )
# Containers for the fitted models and the per-metric contrast plots
# assembled throughout the analysis.
models <- search_differences <- oracle_differences <-
  alg_differences <- participant_group_differences <- list()

# Common RNG seed so model fitting and posterior draws are reproducible.
seed <- 12
# Ordinal (cumulative probit) mixed-effects model of confidence in
# understanding the data, with a random intercept per participant.
models$confidence_udata <- brm(
  data = user_response_data,
  formula = bf(confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  # Weakly informative prior on the latent-scale thresholds.
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  # High adapt_delta to suppress divergent transitions.
  control = list(adapt_delta = 0.99),
  # Cache the fit on disk; reloaded instead of re-sampled on re-knit.
  file = "models/confidence_udata",
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_udata)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.04 0.16 0.75 1.38 1.00 972 1624
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.01 0.39 -2.80 -1.26 1.00 1608
## Intercept[2] -0.84 0.37 -1.61 -0.13 1.00 1563
## Intercept[3] 1.33 0.37 0.59 2.06 1.00 1565
## datasetmovies 0.10 0.30 -0.48 0.70 1.00 1359
## oracledziban 0.06 0.44 -0.80 0.89 1.00 1204
## searchdfs -0.42 0.43 -1.27 0.40 1.00 1272
## task2.RetrieveValue 0.29 0.21 -0.11 0.70 1.00 3249
## task3.Prediction 0.16 0.21 -0.23 0.56 1.00 3486
## task4.Exploration 0.61 0.21 0.19 1.01 1.00 3318
## oracledziban:searchdfs 0.70 0.60 -0.47 1.89 1.00 1232
## Tail_ESS
## Intercept[1] 2093
## Intercept[2] 2149
## Intercept[3] 2240
## datasetmovies 1429
## oracledziban 1403
## searchdfs 1577
## task2.RetrieveValue 2601
## task3.Prediction 2538
## task4.Exploration 2197
## oracledziban:searchdfs 1304
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$confidence_udata)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the threshold (cutpoint) parameters. This model has only
# three thresholds (the summary above lists b_Intercept[1..3]), so the
# previous reference to "b_Intercept[4]" named a non-existent parameter and
# would make pairs() fail.
pairs(
  models$confidence_udata,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]"),
  fixed = TRUE
)
# Pairs plot of the population-level (fixed) effects; strong correlations
# here would indicate the model cannot separate those effects.
pairs(
models$confidence_udata,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
confidence_udata_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_udata, NULL, "Oracle/Search Combination", "Rating")
confidence_udata_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 1.01 0.688 1.33 0.95 mean qi
## 2 bfs dziban 1.05 0.721 1.34 0.95 mean qi
## 3 dfs compassql 0.804 0.426 1.13 0.95 mean qi
## 4 dfs dziban 1.17 0.859 1.48 0.95 mean qi
## 5 bfs compassql 1.01 0.906 1.12 0.5 mean qi
## 6 bfs dziban 1.05 0.941 1.15 0.5 mean qi
## 7 dfs compassql 0.804 0.691 0.926 0.5 mean qi
## 8 dfs dziban 1.17 1.06 1.28 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws; re_formula = NA drops the participant-level
# random effect, so predictions are at the population level.
confidence_udata_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA)
# Combined search/oracle label used for pairwise algorithm comparisons.
confidence_udata_predictive_data$alg <- paste(confidence_udata_predictive_data$search, confidence_udata_predictive_data$oracle)
Differences in user score by search algorithm.
search_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "search", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_udata$plot
Differences in user score by oracle.
oracle_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "oracle", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_udata$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Compare only the two mixed combinations: dfs+compassql vs bfs+dziban.
confidence_udata_predictive_data_subset <- subset(confidence_udata_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data_subset, "alg", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_udata$plot
Differences in user score by participant group
participant_group_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "participant_group", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$confidence_udata$plot
# Ordinal (cumulative probit) mixed-effects model of confidence in the
# answer, with a random intercept per participant.
models$confidence_ans <- brm(
  data = user_response_data,
  formula = bf(confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),  # suppress divergent transitions
  file = "models/confidence_ans",      # cache the fit on disk
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$confidence_ans)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.55 0.14 0.27 0.82 1.00 744 662
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -3.24 0.38 -4.03 -2.49 1.00 2489
## Intercept[2] -2.41 0.29 -2.99 -1.86 1.00 2552
## Intercept[3] -1.57 0.27 -2.11 -1.06 1.00 2354
## Intercept[4] 0.17 0.25 -0.32 0.67 1.00 2519
## datasetmovies -0.16 0.19 -0.55 0.23 1.00 2629
## oracledziban 0.24 0.28 -0.31 0.79 1.00 2084
## searchdfs 0.12 0.28 -0.41 0.66 1.00 2064
## task2.RetrieveValue -0.30 0.21 -0.69 0.11 1.00 2839
## task3.Prediction -1.04 0.20 -1.44 -0.64 1.00 2829
## task4.Exploration -0.62 0.20 -1.01 -0.23 1.00 2993
## oracledziban:searchdfs -0.02 0.39 -0.79 0.81 1.00 1992
## Tail_ESS
## Intercept[1] 2504
## Intercept[2] 2276
## Intercept[3] 2530
## Intercept[4] 2583
## datasetmovies 2500
## oracledziban 2087
## searchdfs 2504
## task2.RetrieveValue 2363
## task3.Prediction 2476
## task4.Exploration 2197
## oracledziban:searchdfs 1885
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$confidence_ans)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the four threshold (cutpoint) parameters of the
# confidence.ans model (the summary above lists b_Intercept[1..4]).
pairs(
models$confidence_ans,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Pairs plot of the population-level (fixed) effects; strong correlations
# here would indicate the model cannot separate those effects.
pairs(
models$confidence_ans,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
confidence_ans_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_ans, NULL, "Oracle/Search Combination", "Rating")
confidence_ans_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 1.01 0.688 1.30 0.95 mean qi
## 2 bfs dziban 1.16 0.853 1.44 0.95 mean qi
## 3 dfs compassql 1.08 0.765 1.37 0.95 mean qi
## 4 dfs dziban 1.22 0.922 1.5 0.95 mean qi
## 5 bfs compassql 1.01 0.906 1.12 0.5 mean qi
## 6 bfs dziban 1.16 1.06 1.25 0.5 mean qi
## 7 dfs compassql 1.08 0.985 1.19 0.5 mean qi
## 8 dfs dziban 1.22 1.12 1.31 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws; re_formula = NA drops the participant-level
# random effect, so predictions are at the population level.
confidence_ans_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA)
# Combined search/oracle label used for pairwise algorithm comparisons.
confidence_ans_predictive_data$alg <- paste(confidence_ans_predictive_data$search, confidence_ans_predictive_data$oracle)
Differences in user score by search algorithm.
search_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "search", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_ans$plot
Differences in user score by oracle.
oracle_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "oracle", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_ans$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Compare only the two mixed combinations: dfs+compassql vs bfs+dziban.
confidence_ans_predictive_data_subset <- subset(confidence_ans_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data_subset, "alg", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_ans$plot
Differences in user score by participant group
participant_group_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "participant_group", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$confidence_ans$plot
# Ordinal (cumulative probit) mixed-effects model of the efficiency rating,
# with a random intercept per participant.
filename <- "efficiency"
models$efficiency <- brm(
  formula = bf(efficiency ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  data = user_response_data,
  control = list(adapt_delta = 0.99),
  # `filename` was previously assigned but never used while the path was
  # hard-coded; file.path("models", filename) yields the same
  # "models/efficiency" cache path.
  file = file.path("models", filename),
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$efficiency)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: efficiency ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.16 0.16 0.88 1.50 1.00 1027 1369
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.58 0.43 -3.49 -1.76 1.00 1163
## Intercept[2] -1.01 0.39 -1.81 -0.26 1.00 1002
## Intercept[3] -0.14 0.39 -0.91 0.64 1.00 1003
## Intercept[4] 1.07 0.40 0.29 1.86 1.00 1007
## datasetmovies 0.31 0.32 -0.31 0.95 1.00 1093
## oracledziban -0.15 0.46 -1.03 0.74 1.00 969
## searchdfs -1.21 0.47 -2.17 -0.31 1.00 1021
## task2.RetrieveValue -0.27 0.19 -0.64 0.12 1.00 3059
## task3.Prediction 0.27 0.19 -0.10 0.65 1.00 3024
## task4.Exploration 0.46 0.20 0.08 0.85 1.00 3251
## oracledziban:searchdfs 0.86 0.64 -0.43 2.15 1.00 978
## Tail_ESS
## Intercept[1] 1461
## Intercept[2] 1361
## Intercept[3] 1337
## Intercept[4] 1307
## datasetmovies 1658
## oracledziban 1427
## searchdfs 1526
## task2.RetrieveValue 2384
## task3.Prediction 2162
## task4.Exploration 2232
## oracledziban:searchdfs 1427
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$efficiency)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the four threshold (cutpoint) parameters of the efficiency
# model (the summary above lists b_Intercept[1..4]).
pairs(
models$efficiency,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Pairs plot of the population-level (fixed) effects; strong correlations
# here would indicate the model cannot separate those effects.
pairs(
models$efficiency,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
efficiency_plot <- user_response_posterior_draws_plot(user_response_data, models$efficiency, NULL, "Oracle/Search Combination", "Rating")
efficiency_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
efficiency_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.745 0.172 1.27 0.95 mean qi
## 2 bfs dziban 0.635 0.0882 1.16 0.95 mean qi
## 3 dfs compassql -0.268 -0.809 0.309 0.95 mean qi
## 4 dfs dziban 0.324 -0.281 0.906 0.95 mean qi
## 5 bfs compassql 0.745 0.562 0.953 0.5 mean qi
## 6 bfs dziban 0.635 0.441 0.824 0.5 mean qi
## 7 dfs compassql -0.268 -0.456 -0.0735 0.5 mean qi
## 8 dfs dziban 0.324 0.125 0.516 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws; re_formula = NA drops the participant-level
# random effect, so predictions are at the population level.
efficiency_predictive_data <- user_response_data %>% add_predicted_draws(models$efficiency, seed = seed, re_formula = NA)
# Combined search/oracle label used for pairwise algorithm comparisons.
efficiency_predictive_data$alg <- paste(efficiency_predictive_data$search, efficiency_predictive_data$oracle)
Differences in user score by search algorithm.
search_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "search", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$efficiency$plot
Differences in user score by oracle.
oracle_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "oracle", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$efficiency$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Compare only the two mixed combinations: dfs+compassql vs bfs+dziban.
# Renamed from efficiency_predictive_data_data_subset (doubled "data") to
# match the *_predictive_data_subset naming used for every other metric.
efficiency_predictive_data_subset <- subset(efficiency_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data_subset, "alg", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$efficiency$plot
Differences in user score by participant group
participant_group_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "participant_group", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$efficiency$plot
# Ordinal (cumulative probit) mixed-effects model of the ease-of-use rating,
# with a random intercept per participant.
models$ease_of_use <- brm(
  data = user_response_data,
  formula = bf(ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),  # suppress divergent transitions
  file = "models/ease_of_use",         # cache the fit on disk
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$ease_of_use)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.06 0.15 0.79 1.40 1.01 694 1152
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.77 0.43 -3.69 -1.96 1.00 1182
## Intercept[2] -1.36 0.38 -2.14 -0.61 1.00 1097
## Intercept[3] -0.43 0.37 -1.15 0.33 1.00 1068
## Intercept[4] 1.44 0.39 0.70 2.22 1.00 1115
## datasetmovies 0.48 0.31 -0.12 1.07 1.00 913
## oracledziban -0.37 0.44 -1.26 0.49 1.00 847
## searchdfs -1.26 0.43 -2.13 -0.43 1.00 900
## task2.RetrieveValue 0.20 0.19 -0.18 0.58 1.00 3202
## task3.Prediction 0.35 0.20 -0.04 0.75 1.00 2983
## task4.Exploration 0.43 0.20 0.04 0.84 1.00 3215
## oracledziban:searchdfs 0.86 0.61 -0.34 2.04 1.01 812
## Tail_ESS
## Intercept[1] 1507
## Intercept[2] 1607
## Intercept[3] 1548
## Intercept[4] 1877
## datasetmovies 1606
## oracledziban 1166
## searchdfs 1163
## task2.RetrieveValue 2500
## task3.Prediction 2034
## task4.Exploration 2134
## oracledziban:searchdfs 1213
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$ease_of_use)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the four threshold (cutpoint) parameters of the ease-of-use
# model (the summary above lists b_Intercept[1..4]).
pairs(
models$ease_of_use,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Pairs plot of the population-level (fixed) effects; strong correlations
# here would indicate the model cannot separate those effects.
pairs(
models$ease_of_use,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
ease_of_use_plot <- user_response_posterior_draws_plot(user_response_data, models$ease_of_use, NULL, "Oracle/Search Combination", "Rating")
ease_of_use_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
ease_of_use_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.940 0.516 1.31 0.95 mean qi
## 2 bfs dziban 0.722 0.279 1.10 0.95 mean qi
## 3 dfs compassql 0.0744 -0.397 0.559 0.95 mean qi
## 4 dfs dziban 0.432 -0.0781 0.875 0.95 mean qi
## 5 bfs compassql 0.940 0.812 1.08 0.5 mean qi
## 6 bfs dziban 0.722 0.588 0.868 0.5 mean qi
## 7 dfs compassql 0.0744 -0.0882 0.25 0.5 mean qi
## 8 dfs dziban 0.432 0.281 0.594 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws; re_formula = NA drops the participant-level
# random effect, so predictions are at the population level.
ease_of_use_predictive_data <- user_response_data %>% add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA)
# Combined search/oracle label used for pairwise algorithm comparisons.
ease_of_use_predictive_data$alg <- paste(ease_of_use_predictive_data$search, ease_of_use_predictive_data$oracle)
Differences in user score by search algorithm.
search_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "search", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$ease_of_use$plot
Differences in user score by oracle.
oracle_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "oracle", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$ease_of_use$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Compare only the two mixed combinations: dfs+compassql vs bfs+dziban.
ease_of_use_predictive_data_subset <- subset(ease_of_use_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data_subset, "alg", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$ease_of_use$plot
Differences in user score by participant group
participant_group_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "participant_group", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$ease_of_use$plot
# Ordinal (cumulative probit) mixed-effects model of the utility rating,
# with a random intercept per participant.
models$utility <- brm(
  data = user_response_data,
  formula = bf(utility ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),  # suppress divergent transitions
  file = "models/utility",             # cache the fit on disk
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$utility)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: utility ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.97 0.14 0.71 1.27 1.01 768 1528
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -1.79 0.37 -2.52 -1.07 1.00 1347
## Intercept[2] -0.54 0.35 -1.23 0.14 1.00 1408
## Intercept[3] 0.11 0.35 -0.58 0.80 1.00 1392
## Intercept[4] 1.41 0.36 0.71 2.13 1.00 1466
## datasetmovies 0.38 0.28 -0.15 0.93 1.00 1375
## oracledziban 0.06 0.39 -0.74 0.82 1.00 1186
## searchdfs -0.83 0.39 -1.58 -0.07 1.00 1228
## task2.RetrieveValue -0.15 0.19 -0.53 0.21 1.00 3553
## task3.Prediction 0.34 0.19 -0.03 0.72 1.00 3373
## task4.Exploration 0.65 0.19 0.27 1.03 1.00 3269
## oracledziban:searchdfs 0.52 0.57 -0.55 1.65 1.00 1107
## Tail_ESS
## Intercept[1] 1805
## Intercept[2] 1980
## Intercept[3] 1957
## Intercept[4] 2029
## datasetmovies 1771
## oracledziban 1610
## searchdfs 1708
## task2.RetrieveValue 1934
## task3.Prediction 2623
## task4.Exploration 2692
## oracledziban:searchdfs 1526
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$utility)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the four threshold (cutpoint) parameters of the utility
# model (the summary above lists b_Intercept[1..4]).
pairs(
models$utility,
pars = c("b_Intercept[1]",
"b_Intercept[2]",
"b_Intercept[3]",
"b_Intercept[4]"),
fixed = TRUE
)
# Pairs plot of the population-level (fixed) effects; strong correlations
# here would indicate the model cannot separate those effects.
pairs(
models$utility,
pars = c("b_datasetmovies",
"b_oracledziban",
"b_searchdfs",
"b_task2.RetrieveValue",
"b_task3.Prediction",
"b_task4.Exploration"),
fixed = TRUE
)
We now look at an average response for utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
utility_plot <- user_response_posterior_draws_plot(user_response_data, models$utility, NULL, "Oracle/Search Combination", "Rating")
utility_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
utility_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.558 -0.0156 1.08 0.95 mean qi
## 2 bfs dziban 0.620 0.0588 1.12 0.95 mean qi
## 3 dfs compassql -0.200 -0.721 0.353 0.95 mean qi
## 4 dfs dziban 0.329 -0.25 0.859 0.95 mean qi
## 5 bfs compassql 0.558 0.375 0.75 0.5 mean qi
## 6 bfs dziban 0.620 0.441 0.809 0.5 mean qi
## 7 dfs compassql -0.200 -0.397 -0.0147 0.5 mean qi
## 8 dfs dziban 0.329 0.141 0.516 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior predictive draws; re_formula = NA drops the participant-level
# random effect, so predictions are at the population level.
utility_predictive_data <- user_response_data %>% add_predicted_draws(models$utility, seed = seed, re_formula = NA)
# Combined search/oracle label used for pairwise algorithm comparisons.
utility_predictive_data$alg <- paste(utility_predictive_data$search, utility_predictive_data$oracle)
Differences in user score by search algorithm.
search_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "search", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$utility$plot
Differences in user score by oracle.
oracle_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "oracle", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$utility$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Compare only the two mixed combinations: dfs+compassql vs bfs+dziban.
utility_predictive_data_subset <- subset(utility_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data_subset, "alg", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$utility$plot
Differences in user score by participant group
participant_group_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "participant_group", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$utility$plot
# Ordinal (cumulative probit) mixed-effects model of the overall rating,
# with a random intercept per participant.
models$overall <- brm(
  data = user_response_data,
  formula = bf(overall ~ dataset + oracle * search + task + (1 | participant_id)),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),  # suppress divergent transitions
  file = "models/overall",             # cache the fit on disk
  seed = seed
)
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$overall)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: overall ~ dataset + oracle * search + task + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.49 0.20 1.13 1.93 1.00 712 1065
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1] -2.94 0.52 -4.00 -1.94 1.00 667
## Intercept[2] -1.58 0.49 -2.54 -0.64 1.00 620
## Intercept[3] -0.31 0.48 -1.25 0.64 1.00 600
## Intercept[4] 1.84 0.50 0.88 2.83 1.00 608
## datasetmovies -0.05 0.40 -0.86 0.72 1.00 396
## oracledziban 0.06 0.56 -1.04 1.19 1.00 462
## searchdfs -0.85 0.56 -1.92 0.24 1.00 467
## task2.RetrieveValue -0.02 0.20 -0.41 0.38 1.00 2167
## task3.Prediction 0.43 0.20 0.03 0.82 1.00 2125
## task4.Exploration 0.66 0.21 0.24 1.07 1.00 2149
## oracledziban:searchdfs 0.56 0.78 -0.96 2.06 1.00 472
## Tail_ESS
## Intercept[1] 1177
## Intercept[2] 1123
## Intercept[3] 1082
## Intercept[4] 1052
## datasetmovies 1001
## oracledziban 780
## searchdfs 992
## task2.RetrieveValue 2138
## task3.Prediction 2355
## task4.Exploration 1898
## oracledziban:searchdfs 884
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
plot(models$overall)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots of the four ordinal cutpoints; strong
# correlations here would mean the model struggles to separate them.
intercept_pars <- paste0("b_Intercept[", 1:4, "]")
pairs(
  models$overall,
  pars = intercept_pars,
  fixed = TRUE
)
# Pairwise posterior scatterplots of the population-level (fixed) effects.
population_pars <- c(
  "b_datasetmovies",
  "b_oracledziban",
  "b_searchdfs",
  "b_task2.RetrieveValue",
  "b_task3.Prediction",
  "b_task4.Exploration"
)
pairs(
  models$overall,
  pars = population_pars,
  fixed = TRUE
)
We now look at an average response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.
# Posterior draws of the average Overall rating for each oracle/search
# combination; helper defined in helper_functions.R.
overall_plot <- user_response_posterior_draws_plot(user_response_data, models$overall, NULL, "Oracle/Search Combination", "Rating")
overall_plot$plot
We can get the numeric values of the interval boundaries shown above with `mean_qi`.
overall_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql 0.707 0.203 1.12 0.95 mean qi
## 2 bfs dziban 0.742 0.279 1.13 0.95 mean qi
## 3 dfs compassql 0.196 -0.353 0.721 0.95 mean qi
## 4 dfs dziban 0.580 0.0469 1 0.95 mean qi
## 5 bfs compassql 0.707 0.562 0.875 0.5 mean qi
## 6 bfs dziban 0.742 0.618 0.882 0.5 mean qi
## 7 dfs compassql 0.196 0 0.397 0.5 mean qi
## 8 dfs dziban 0.580 0.422 0.75 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in users' Overall ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Posterior-predictive draws of Overall ratings, ignoring participant-level
# random effects (re_formula = NA), labelled with the "search oracle"
# combination for the comparisons below.
overall_predictive_data <- user_response_data %>%
  add_predicted_draws(models$overall, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))
Differences in user score by search algorithm.
# Posterior distribution of the difference in mean Overall rating between the
# two search algorithms (bfs vs dfs).
search_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "search", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$overall$plot
Differences in user score by oracle.
# NOTE(review): a manual computation of these differences (group_by oracle,
# per-draw mean of predictions, compare_levels, plus an inline ggplot) used
# to precede this call but was immediately overwritten by the helper below,
# so it has been removed as dead code. It also used weighted.mean() with no
# weights, which is just mean().
# Posterior distribution of the difference in mean Overall rating between the
# two oracles (dziban vs compassql).
oracle_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "oracle", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$plot
Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)
# Restrict to the two specific algorithm combinations of interest before
# computing the difference in average Overall rating between them.
overall_predictive_data_subset <- subset(overall_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data_subset, "alg", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$overall$plot
Differences in user score by participant group
# Posterior distribution of the difference in mean Overall rating between
# participant groups.
participant_group_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "participant_group", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$overall$plot
Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We'll start with differences in search algorithms.
# Stack the per-metric posterior difference draws for the search comparison
# into one data frame, ordering the metric factor to match `analyses`.
search_difference_tables <- list(
  search_differences$confidence_udata$differences,
  search_differences$confidence_ans$differences,
  search_differences$efficiency$differences,
  search_differences$ease_of_use$differences,
  search_differences$utility$differences,
  search_differences$overall$differences
)
combined_search_differences <- do.call(rbind, search_difference_tables)
combined_search_differences$metric <- factor(combined_search_differences$metric, levels = rev(analyses))
# Posterior differences in mean rating between search algorithms for the two
# confidence metrics only.
combined_search_differences_confidence <- combined_search_differences %>%
  filter(metric %in% confidence_metrics)
search_confidence_xlab <- paste0("Expected Difference in Rating (", combined_search_differences_confidence[1,'search'], ")")
search_differences_plot_confidence <- ggplot(combined_search_differences_confidence, aes(x = difference, y = metric)) +
  ylab("Confidence") +
  xlab(search_confidence_xlab) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal()
search_differences_plot_confidence
View intervals
# Numeric 95% and 50% intervals for the search-algorithm differences above.
fit_info_search_differences_confidence <- combined_search_differences_confidence %>% group_by(search, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_search_differences_confidence
## # A tibble: 4 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs - dfs confidence.ans -0.0625 -0.364 0.235 0.95 mean qi
## 2 bfs - dfs confidence.udata 0.0465 -0.273 0.371 0.95 mean qi
## 3 bfs - dfs confidence.ans -0.0625 -0.167 0.0455 0.5 mean qi
## 4 bfs - dfs confidence.udata 0.0465 -0.0682 0.159 0.5 mean qi
# Posterior differences in mean rating between search algorithms for the
# preference metrics (efficiency, ease of use, utility, overall).
combined_search_differences_preference <- subset(combined_search_differences, metric %in% preference_metrics)
search_differences_plot_preference <- combined_search_differences_preference %>%
ggplot(aes(x = difference, y = metric)) +
# Fixed axis label: these are preference metrics; "Confidence" was a
# copy-paste carry-over from the confidence plot above.
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_search_differences_preference[1,'search'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
search_differences_plot_preference
View intervals
# Numeric 95% and 50% intervals for the preference-metric differences above.
fit_info_search_differences_preference <- combined_search_differences_preference %>% group_by(search, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_search_differences_preference
## # A tibble: 8 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs - dfs overall 0.343 -0.144 0.826 0.95 mean qi
## 2 bfs - dfs utility 0.533 -0.0227 1.08 0.95 mean qi
## 3 bfs - dfs ease.of.use 0.580 0.152 1.03 0.95 mean qi
## 4 bfs - dfs efficiency 0.669 0.0758 1.24 0.95 mean qi
## 5 bfs - dfs overall 0.343 0.189 0.508 0.5 mean qi
## 6 bfs - dfs utility 0.533 0.348 0.727 0.5 mean qi
## 7 bfs - dfs ease.of.use 0.580 0.432 0.720 0.5 mean qi
## 8 bfs - dfs efficiency 0.669 0.470 0.871 0.5 mean qi
# Stack the per-metric posterior difference draws for the oracle comparison
# into one data frame, ordering the metric factor to match `analyses`.
oracle_difference_tables <- list(
  oracle_differences$confidence_udata$differences,
  oracle_differences$confidence_ans$differences,
  oracle_differences$efficiency$differences,
  oracle_differences$ease_of_use$differences,
  oracle_differences$utility$differences,
  oracle_differences$overall$differences
)
combined_oracle_differences <- do.call(rbind, oracle_difference_tables)
combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels = rev(analyses))
# Posterior differences in mean rating between oracles for the two
# confidence metrics only.
combined_oracle_differences_confidence <- combined_oracle_differences %>%
  filter(metric %in% confidence_metrics)
oracle_confidence_xlab <- paste0("Expected Difference in Rating (", combined_oracle_differences_confidence[1,'oracle'], ")")
oracle_differences_plot_confidence <- ggplot(combined_oracle_differences_confidence, aes(x = difference, y = metric)) +
  ylab("Confidence") +
  xlab(oracle_confidence_xlab) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal()
oracle_differences_plot_confidence
View intervals
# Numeric 95% and 50% intervals for the oracle differences above.
fit_info_oracle_differences_confidence <- combined_oracle_differences_confidence %>% group_by(oracle, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_oracle_differences_confidence
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compa… confidence.… 0.140 -0.144 0.439 0.95 mean qi
## 2 dziban - compa… confidence.… 0.203 -0.129 0.530 0.95 mean qi
## 3 dziban - compa… confidence.… 0.140 0.0379 0.235 0.5 mean qi
## 4 dziban - compa… confidence.… 0.203 0.0909 0.311 0.5 mean qi
# Posterior differences in mean rating between oracles for the preference
# metrics (efficiency, ease of use, utility, overall).
combined_oracle_differences_preference <- subset(combined_oracle_differences, metric %in% preference_metrics)
oracle_differences_plot_preference <- combined_oracle_differences_preference %>%
ggplot(aes(x = difference, y = metric)) +
# Fixed axis label: these are preference metrics; "Confidence" was a
# copy-paste carry-over from the confidence plot above.
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_preference[1,'oracle'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences_plot_preference
View intervals
# Numeric 95% and 50% intervals for the preference-metric differences above.
fit_info_oracle_differences_preference <- combined_oracle_differences_preference %>% group_by(oracle, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_oracle_differences_preference
## # A tibble: 8 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compass… overall 0.220 -0.265 0.697 0.95 mean qi
## 2 dziban - compass… utility 0.312 -0.227 0.841 0.95 mean qi
## 3 dziban - compass… ease.of.u… 0.0874 -0.364 0.561 0.95 mean qi
## 4 dziban - compass… efficiency 0.261 -0.303 0.826 0.95 mean qi
## 5 dziban - compass… overall 0.220 0.0530 0.386 0.5 mean qi
## 6 dziban - compass… utility 0.312 0.129 0.492 0.5 mean qi
## 7 dziban - compass… ease.of.u… 0.0874 -0.0682 0.242 0.5 mean qi
## 8 dziban - compass… efficiency 0.261 0.0682 0.455 0.5 mean qi
# Stack the per-metric posterior difference draws for the dfs-compassql vs
# bfs-dziban comparison, ordering the metric factor to match `analyses`.
alg_difference_tables <- list(
  alg_differences$confidence_udata$differences,
  alg_differences$confidence_ans$differences,
  alg_differences$efficiency$differences,
  alg_differences$ease_of_use$differences,
  alg_differences$utility$differences,
  alg_differences$overall$differences
)
combined_alg_differences <- do.call(rbind, alg_difference_tables)
combined_alg_differences$metric <- factor(combined_alg_differences$metric, levels = rev(analyses))
# Posterior differences in mean rating between the two algorithm combinations
# for the two confidence metrics only.
combined_alg_differences_confidence <- combined_alg_differences %>%
  filter(metric %in% confidence_metrics)
alg_confidence_xlab <- paste0("Expected Difference in Rating (", combined_alg_differences_confidence[1,'alg'], ")")
alg_differences_plot_confidence <- ggplot(combined_alg_differences_confidence, aes(x = difference, y = metric)) +
  ylab("Confidence") +
  xlab(alg_confidence_xlab) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal()
alg_differences_plot_confidence
View intervals
# Numeric 95% and 50% intervals for the algorithm-combination differences above.
fit_info_alg_differences_confidence <- combined_alg_differences_confidence %>% group_by(alg, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_alg_differences_confidence
## # A tibble: 4 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs compassql -… confidence… -0.0757 -0.5 0.338 0.95 mean qi
## 2 dfs compassql -… confidence… -0.242 -0.721 0.235 0.95 mean qi
## 3 dfs compassql -… confidence… -0.0757 -0.210 0.0588 0.5 mean qi
## 4 dfs compassql -… confidence… -0.242 -0.397 -0.0882 0.5 mean qi
# Posterior differences in mean rating between the two algorithm combinations
# for the preference metrics (efficiency, ease of use, utility, overall).
combined_alg_differences_preference <- subset(combined_alg_differences, metric %in% preference_metrics)
alg_differences_plot_preference <- combined_alg_differences_preference %>%
ggplot(aes(x = difference, y = metric)) +
# Fixed axis label: these are preference metrics; "Confidence" was a
# copy-paste carry-over from the confidence plot above.
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_alg_differences_preference[1,'alg'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
alg_differences_plot_preference
View intervals
# Numeric 95% and 50% intervals for the preference-metric differences above.
fit_info_alg_differences_preference <- combined_alg_differences_preference %>% group_by(alg, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_alg_differences_preference
## # A tibble: 8 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs compassql - b… overall -0.546 -1.22 0.132 0.95 mean qi
## 2 dfs compassql - b… utility -0.820 -1.53 -0.0588 0.95 mean qi
## 3 dfs compassql - b… ease.of.… -0.648 -1.28 -0.0294 0.95 mean qi
## 4 dfs compassql - b… efficien… -0.903 -1.68 -0.0732 0.95 mean qi
## 5 dfs compassql - b… overall -0.546 -0.779 -0.309 0.5 mean qi
## 6 dfs compassql - b… utility -0.820 -1.07 -0.574 0.5 mean qi
## 7 dfs compassql - b… ease.of.… -0.648 -0.868 -0.426 0.5 mean qi
## 8 dfs compassql - b… efficien… -0.903 -1.19 -0.632 0.5 mean qi
# Stack the per-metric posterior difference draws for the participant-group
# comparison, ordering the metric factor to match `analyses`.
participant_group_difference_tables <- list(
  participant_group_differences$confidence_udata$differences,
  participant_group_differences$confidence_ans$differences,
  participant_group_differences$efficiency$differences,
  participant_group_differences$ease_of_use$differences,
  participant_group_differences$utility$differences,
  participant_group_differences$overall$differences
)
combined_participant_group_differences <- do.call(rbind, participant_group_difference_tables)
combined_participant_group_differences$metric <- factor(combined_participant_group_differences$metric, levels = rev(analyses))
# Posterior differences in mean rating between participant groups for the two
# confidence metrics only.
combined_participant_group_differences_confidence <- combined_participant_group_differences %>%
  filter(metric %in% confidence_metrics)
participant_group_confidence_xlab <- paste0("Expected Difference in Rating (", combined_participant_group_differences_confidence[1,'participant_group'], ")")
participant_group_differences_plot_confidence <- ggplot(combined_participant_group_differences_confidence, aes(x = difference, y = metric)) +
  ylab("Confidence") +
  xlab(participant_group_confidence_xlab) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal()
participant_group_differences_plot_confidence
View intervals
# Numeric 95% and 50% intervals for the participant-group differences above.
fit_info_participant_group_differences_confidence <- combined_participant_group_differences_confidence %>% group_by(participant_group, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_participant_group_differences_confidence
## # A tibble: 4 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - profess… confiden… 0.00390 -0.163 0.172 0.95 mean qi
## 2 student - profess… confiden… 0.00481 -0.139 0.150 0.95 mean qi
## 3 student - profess… confiden… 0.00390 -0.0538 0.0611 0.5 mean qi
## 4 student - profess… confiden… 0.00481 -0.0447 0.0534 0.5 mean qi
# Posterior differences in mean rating between participant groups for the
# preference metrics (efficiency, ease of use, utility, overall).
combined_participant_group_differences_preference <- subset(combined_participant_group_differences, metric %in% preference_metrics)
participant_group_differences_plot_preference <- combined_participant_group_differences_preference %>%
ggplot(aes(x = difference, y = metric)) +
# Fixed axis label: these are preference metrics; "Confidence" was a
# copy-paste carry-over from the confidence plot above.
ylab("Preference") +
xlab(paste0("Expected Difference in Rating (",combined_participant_group_differences_preference[1,'participant_group'],")")) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
participant_group_differences_plot_preference
View intervals
# Numeric 95% and 50% intervals for the preference-metric differences above.
fit_info_participant_group_differences_preference <- combined_participant_group_differences_preference %>% group_by(participant_group, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_participant_group_differences_preference
## # A tibble: 8 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - professi… overall 0.00821 -0.155 0.182 0.95 mean qi
## 2 student - professi… utility -0.00242 -0.242 0.238 0.95 mean qi
## 3 student - professi… ease.of… 0.000685 -0.189 0.198 0.95 mean qi
## 4 student - professi… efficie… 0.00465 -0.213 0.237 0.95 mean qi
## 5 student - professi… overall 0.00821 -0.0505 0.0649 0.5 mean qi
## 6 student - professi… utility -0.00242 -0.0841 0.0780 0.5 mean qi
## 7 student - professi… ease.of… 0.000685 -0.0659 0.0665 0.5 mean qi
## 8 student - professi… efficie… 0.00465 -0.0702 0.0798 0.5 mean qi